#Proteomics analysis at the individual protein level
setwd('/Users/emmatimminsschiffman/Documents/Dissertation/proteomics/DB post-genome/OT expression comparison')

speccounts<-read.csv('all samples spec counts.csv', header=T, row.names=1)

#look at frequency of occurrence
foa.plots(speccounts)

#screen data for insufficient variables
#how many proteins are expressed in only half of the oysters?
testdata<-drop.var(speccounts, min.fo=8)
str(testdata)
#35
#which proteins occur in over 95% of the oysters?
testdata2<-drop.var(speccounts, max.po=95)
str(testdata2)
#779
#how many proteins have little variation?
testdata3<-drop.var(speccounts, min.cv=5)
str(testdata3)
#0

#get rid of the rare (expressed in less than half the oysters) and super abundant (expressed in most of the oysters) proteins
speccounts2<-drop.var(speccounts,min.fo=8)
speccounts3<-drop.var(speccounts2, max.po=95)
#this leaves 427 proteins

#log(x+1) transformation - need the +1 for 0 values
speccounts.1<-(speccounts3+1)
speccounts.trans<-data.trans(speccounts.1, method='log', plot=F)

#bray curtis matrix
speccounts.bc<-vegdist(speccounts.trans, method='bray')
speccounts.bc

#NMDS
speccounts.nmds<-metaMDS(speccounts.trans, distance='bray', k=2, trymax=100, autotransform=F)
#stress is low, so 2 dimensions is good
nmds.scree(speccounts.trans, distance='bray', k=10, autotransform=F, trymax=20)
plot(speccounts.nmds, type='n')
text(speccounts.nmds, labels=row.names(speccounts.trans))
#calculate loadings on each axis
vec.spec<-envfit(speccounts.nmds$points, speccounts.trans, perm=100)
#*0.05=10008811, 10013314, 10007261, 10025805, 10026987, 10007643, 10003923, 10021998, 10008240, 10020454, 10014801, 1005085, 10012796, 1004243, 10020479, 10006472, 10027362, 10012406, 10010975, 10027546, 10020770, 10001845, 10024853, 10000055, 10004733, 10015183, 10006979, 10023886, 10005508, 10008200, 10025760, 10008961, 10023712, 10006329, 10022421, 10000994, 10000033, 10026642, 10028023, 10002631, 10000067, 10007636, 10012096, 10022619, 10015552, 10016641
#**0.01=10014956, 10000591, 10018703, 10010841, 10011108, 10016815, 10002017, 10016941, 10006212, 10006511, 10014919, 10021336, 10016539, 10019610
ordiplot(speccounts.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.spec, p.max=0.01, col='blue', cex=0.5)
ordihull(speccounts.nmds, treatment[,3], label=T)

#repeat the above except without removing high or low abundance proteins
spec.plus1<-(speccounts+1)
spec.trans<-data.trans(spec.plus1, method='log', plot=F)
speccounts2.nmds<-metaMDS(spec.trans, distance='bray', k=2, trymax=100, autotransform=F)
#low stress
ordiplot(speccounts2.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
ordihull(speccounts2.nmds, treatment[,3], label=T)
vec2.spec<-envfit(speccounts2.nmds$points, spec.trans, perm=100)
#output saved as spreadsheet, all proteins NMDS loadings 120712
plot(vec2.spec, p.max=0.01, col='blue', cex=0.5)

#only low abundance proteins removed
speccounts2.1<-(speccounts2+1)
speccounts2.trans<-data.trans(speccounts2.1, method='log', plot=F)
speccounts3.nmds<-metaMDS(speccounts2.trans, distance='bray', k=2, trymax=100, autotransform=F)
ordiplot(speccounts3.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
ordihull(speccounts3.nmds, treatment[,3], label=F, col='grey47', cex=0.75)
vec3.spec<-envfit(speccounts3.nmds$points, speccounts2.trans, perm=100)
#output saved as no low abundance NMDS loadings 120712
plot(vec3.spec, p.max=0.01, col='blue', cex=0.5)

#low abundance removed and compare just high and low pCO2, no MS
spec2.sub<-read.csv('CO2 only samples spec counts.csv', header=T, row.names=1)
spec2sub.nmds<-metaMDS(spec2.sub, distance='bray', k=2, trymax=100, autotransform=F)
treat.sub<-read.csv('CO2 treatment.csv', header=T, row.names=1)
ordiplot(spec2sub.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
ordihull(spec2sub.nmds, treat.sub[,1], label=F, col='grey47')

specsub.row<-data.stand(spec2.sub, method='total', margin='row', plot=F)
specsub.d<-vegdist(specsub.row, 'bray')
co2.anos<-anosim(specsub.d, grouping=treat.sub[,1])
summary(co2.anos)
#R=0.01042, p=0.344

#only high abundance proteins removed
speccounts4<-drop.var(speccounts, max.po=95)
speccounts4.1<-(speccounts4+1)
speccounts4.trans<-data.trans(speccounts4.1, method='log', plot=F)
speccounts4.nmds<-metaMDS(speccounts4.trans, distance='bray', k=2, trymax=100, autotransform=F)
ordiplot(speccounts4.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
vec4.spec<-envfit(speccounts4.nmds$points, speccounts4.trans, perm=100)
#output saved as no high abundance NMDS loadings 120712
plot(vec4.spec, p.max=0.01, col='blue', cex=0.5)
ordihull(speccounts4.nmds, treatment[,3], label=T)

#ANOSIM
#read in file of individuals assigned to treatment groups
treatment<-read.csv('treatment groupings.csv', header=T, row.names=1)
#normalize dataset by row
spec.row<-data.stand(speccounts, method='total', margin='row', plot=F)
#create dissimilarity matrix using Bray-Curtis coefficient
spec.d<-vegdist(spec.row, 'bray')

#anosim for pCO2 grouping
y.anosim1<-anosim(spec.d, grouping=treatment[,1])
summary(y.anosim1)
#R=-0.01786, p=0.571
plot.anosim(y.anosim1)

#anosim for MS grouping
y.anosim2<-anosim(spec.d, grouping=treatment[,2])
summary(y.anosim2)
#R=-0.024, p=0.563
plot.anosim(y.anosim2)

#anosim for all treatments
y.anosim3<-anosim(spec.d, treatment[,3])
summary(y.anosim3)
#R=-0.01302, p=0.556
plot.anosim(y.anosim3)

#ANOSIM with low abundance proteins removed
spec2.row<-data.stand(speccounts2, method='total', margin='row', plot=F)
spec2.d<-vegdist(spec2.row, 'bray')
spec2.anos<-anosim(spec2.d, grouping=treatment[,3])
summary(spec2.anos)
#R=-0.01823, p=0.57

#PROTEOMICS ANALYSIS AT GO LEVEL
#BIOLOGICAL PROCESSES
spec.go<-read.csv('GO processes spec counts.csv', header=T, row.names=1)
#log(x+1) transformation - need the +1 for 0 values
spec.go.1<-(spec.go+1)
specgo.trans<-data.trans(spec.go.1, method='log', plot=F)

#NMDS
specgo.nmds<-metaMDS(specgo.trans, distance='bray', k=2, trymax=100, autotransform=F)
#stress is low, so 2 dimensions is good
nmds.scree(specgo.trans, distance='bray', k=10, autotransform=F, trymax=20)
plot(specgo.nmds, type='n')
text(specgo.nmds, labels=row.names(specgo.trans))
#calculate loadings on each axis
vec.specgo<-envfit(specgo.nmds$points, specgo.trans, perm=100)
#saved as sheet in workbook

ordiplot(specgo.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.specgo, p.max=0.01, col='blue', cex=0.5)
ordihull(specgo.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue'))

#anosim
specgo.row<-data.stand(spec.go, method='total', margin='row', plot=F)
specgo.d<-vegdist(specgo.row, 'bray')
#pCO2
go.anosim1<-anosim(specgo.d, treatment[,1])
summary(go.anosim1)
#R=0.1473, p=0.029
#MS
go.anosim2<-anosim(specgo.d, treatment[,2])
summary(go.anosim2)
#R=-0.0106, p=0.504
#both
go.anosim3<-anosim(specgo.d, treatment[,3])
summary(go.anosim3)
#R=0.1562, p=0.05
plot.anosim(go.anosim1)
plot.anosim(go.anosim3)

#CELLULAR COMPONENTS
cc.go<-read.csv('GO components spec counts.csv', header=T, row.names=1)
cc.go.1<-(cc.go+1)
cc.go.trans<-data.trans(cc.go.1, method='log', plot=F)

#NMDS
ccgo.nmds<-metaMDS(cc.go.trans, distance='bray', k=2, trymax=100, autotransform=F)
vec.ccgo<-envfit(ccgo.nmds$points, cc.go.trans, perm=100)
ordiplot(ccgo.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.ccgo, p.max=0.01, col='blue', cex=0.5)
ordihull(ccgo.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue'))

#anosim
ccgo.row<-data.stand(cc.go, method='total', margin='row', plot=F)
ccgo.d<-vegdist(ccgo.row, 'bray')
#pCO2
cc.anosim1<-anosim(ccgo.d, treatment[,1])
summary(cc.anosim1)
#R=0.000558, p=0.424
#MS
cc.anosim2<-anosim(ccgo.d, treatment[,2])
summary(cc.anosim2)
#R=0.007254, p=0.378
#both
cc.anosim3<-anosim(ccgo.d, treatment[,3])
summary(cc.anosim3)
#R=0.04861, p=0.248

#MOLECULAR FUNCTION
mf.go<-read.csv('GO function spec counts.csv', header=T, row.names=1)
mf.go.1<-(mf.go+1)
mf.go.trans<-data.trans(mf.go.1, method='log', plot=F)

#NMDS
mfgo.nmds<-metaMDS(mf.go.trans, distance='bray', k=2, trymax=100, autotransform=F)
vec.mfgo<-envfit(mfgo.nmds$points, mf.go.trans, perm=100)
ordiplot(mfgo.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.mfgo, p.max=0.01, col='blue', cex=0.5)
ordihull(mfgo.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue'))

#anosim
mfgo.row<-data.stand(mf.go, method='total', margin='row', plot=F)
mfgo.d<-vegdist(mfgo.row, 'bray')
#pCO2
mf.anosim1<-anosim(mfgo.d, treatment[,1])
summary(mf.anosim1)
#R=-0.04129, p=0.749
#MS
mf.anosim2<-anosim(mfgo.d, treatment[,2])
summary(mf.anosim2)
#R=0.02455, p=0.338
#both
mf.anosim3<-anosim(mfgo.d, treatment[,3])
summary(mf.anosim3)
#R=0.02865, p=0.333

#PROTEOMICS ANALYSIS AT GO SLIM LEVEL
#BIOLOGICAL PROCESSES
bpslim<-read.csv('GO Slim processes spec counts.csv', header=T, row.names=1)
bpslim.trans<-data.trans(bpslim, method='log', plot=F)

#NMDS
bpslim.nmds<-metaMDS(bpslim.trans, distance='bray', k=2, trymax=100, autotransform=F)
vec.bpslim<-envfit(bpslim.nmds$points, bpslim.trans, perm=100)
ordiplot(bpslim.nmds, choices=c(1,2), type='text', display='sites', xlab='Axis 1', ylab='Axis 2')
plot(vec.bpslim, p.max=0.01, col='blue', cex=0.5)
ordihull(bpslim.nmds, treatment[,3], label=T, col=c('magenta', 'orange', 'green', 'blue'))

#anosim
bpslim.row<-data.stand(bpslim, method='total', margin='row', plot=F)
bpslim.d<-vegdist(bpslim.row, 'bray')
#pCO2
bpslim.anosim1<-anosim(bpslim.d, treatment[,1])
summary(bpslim.anosim1)
#R=0.1886, p=0.008
plot.anosim(bpslim.anosim1)
#MS
bpslim.anosim2<-anosim(bpslim.d, treatment[,2])
summary(bpslim.anosim2)
#R=-0.01004, p=0.49
#both
bpslim.anosim3<-anosim(bpslim.d, treatment[,3])
summary(bpslim.anosim3)
#R=0.2691, p=0.003
plot.anosim(bpslim.anosim3)